Example report:

                          Number of input reads |   93371899
                      Average input read length |   152
                                    UNIQUE READS:
                   Uniquely mapped reads number |   58118693
                        Uniquely mapped reads % |   62.24%
                          Average mapped length |   133.52
                       Number of splices: Total |   9287778
            Number of splices: Annotated (sjdb) |   0
                       Number of splices: GT/AG |   9198064
                       Number of splices: GC/AG |   62090
                       Number of splices: AT/AC |   5194
               Number of splices: Non-canonical |   22430
                      Mismatch rate per base, % |   2.03%
                         Deletion rate per base |   0.01%
                        Deletion average length |   1.43
                        Insertion rate per base |   0.00%
                       Insertion average length |   1.43
                             MULTI-MAPPING READS:
        Number of reads mapped to multiple loci |   2779036
             % of reads mapped to multiple loci |   2.98%
        Number of reads mapped to too many loci |   36707
             % of reads mapped to too many loci |   0.04%
                                  UNMAPPED READS:
  Number of reads unmapped: too many mismatches |   0
       % of reads unmapped: too many mismatches |   0.00%
            Number of reads unmapped: too short |   31910266
                 % of reads unmapped: too short |   34.18%
                Number of reads unmapped: other |   527197
                     % of reads unmapped: other |   0.56%
                                  CHIMERIC READS:
                       Number of chimeric reads |   65461
                            % of chimeric reads |   0.07%
library(ggplot2)
library(UpSetR)
setwd('~/git/recount-pump/metadata/qc_expt')

This is a subsample, taking a random 20% of the total # rows from samples.tsv. This way of sampling is flawed because it does not keep studies intact.

I think that most of the STAR-related QC measures come from the Log.final.out output file.

m <- read.table('samples.sorted_by_study.tsv', sep='\t', comment.char='', quote='', header=T)
frac_non <- m$star.number_of_splices._non.canonical / m$star.number_of_splices._total
frac_non_thresh <- frac_non > 0.4
ggplot(m, aes(x=1:nrow(m), y=frac_non, color=frac_non > 0.4, alpha=I(0.2))) +
  geom_point() +
  theme_bw()
## Warning: Removed 514 rows containing missing values (geom_point).

frac_non_samps <- which(frac_non_thresh)
ggplot(m, aes(y=frac_non)) + geom_boxplot() + theme_bw()
## Warning: Removed 514 rows containing non-finite values (stat_boxplot).

short_frac <- m$star.number_of_reads_unmapped._too_short / m$star.number_of_input_reads
short_frac_thresh <- short_frac > 0.8
ggplot(m, aes(x=1:nrow(m), y=short_frac, color=short_frac > 0.8, alpha=I(0.05))) +
  geom_point() +
  theme_bw()

short_frac_samps <- which(short_frac_thresh)
assigned_frac <- m$gene_fc_count_all.assigned/m$gene_fc_count_all.total
assigned_frac_thresh <- assigned_frac < 0.05
ggplot(m, aes(x=1:nrow(m), y=assigned_frac, color=assigned_frac < 0.05, alpha=I(0.05))) +
  geom_point() + theme_bw() + labs(x='Index', y='% assigned to gene')
## Warning: Removed 144 rows containing missing values (geom_point).

assigned_frac_samps <- which(assigned_frac_thresh)
nreads <- (m$star.number_of_input_reads + m$star.number_of_input_reads2)
nmapped <- (m$star.all_mapped_reads + m$star.all_mapped_reads2)
mapped <- nmapped / nreads
mapped_thresh <- mapped < 0.01
mapped_samps <- which(mapped_thresh)
ggplot(m, aes(x=1:nrow(m), y=mapped, color=mapped < 0.01, alpha=I(0.5))) +
  geom_point() + theme_bw() + labs(x='Index', y='% mapped')

nrep <- (m$star.number_of_reads_mapped_to_multiple_loci + m$star.number_of_reads_mapped_to_multiple_loci2)
repal <- nrep / nmapped
repal_thresh <- repal > 0.85
repal_samps <- which(repal_thresh)
ggplot(m, aes(x=1:nrow(m), y=repal, color=repal > 0.85, alpha=I(0.5))) +
  geom_point() + theme_bw() + labs(x='Index', y='Fraction of aligned reads that aligned repetitively')
## Warning: Removed 141 rows containing missing values (geom_point).

mt <- m$aligned_reads..chrm
mt_thresh <- mt > 40
mt_samps <- which(mt_thresh)
ggplot(m, aes(x=1:nrow(m), y=mt, color=mt > 40, alpha=I(0.5))) +
  geom_point() + theme_bw() + labs(x='Index', y='% mapped to MT')

few <- m$star.number_of_input_reads + m$star.number_of_input_reads2
few_thresh <- few < 10000
few_samps <- which(few_thresh)
ggplot(m, aes(x=1:nrow(m), y=few, color=few < 10000, alpha=I(0.5))) +
  geom_point() + theme_bw() + labs(x='Index', y='# reads')

# percent
pct_chim <- m$star.._of_chimeric_reads
pct_chim_thresh <- pct_chim > 20
ggplot(m, aes(x=1:nrow(m), y=pct_chim, color=pct_chim > 20, alpha=I(0.5))) +
  geom_point() + theme_bw() + labs(x='Index', y='% chimeric')

pct_chim_samps <- which(pct_chim_thresh)
read_len <- m$star.average_input_read_length
read_len_thresh <- read_len <= 30
ggplot(m, aes(x=1:nrow(m), y=read_len, color=read_len <= 30, alpha=I(0.5))) +
  geom_point() +
  theme_bw()

read_len_samps <- which(read_len_thresh)
table(m$study_acc[read_len < 27])
## 
## DRP002623 ERP021735 SRP000762 SRP002915 SRP003726 SRP004042 SRP004847 SRP004965 
##         7         1         7        27        14         3         8         3 
## SRP005279 SRP008218 SRP009247 SRP014830 SRP017786 SRP029889 SRP032279 SRP039684 
##        36        14        14         1         6        25       119         3 
## SRP041993 SRP055513 SRP058476 SRP061466 SRP065282 SRP068551 SRP098746 SRP100634 
##         3       111         4        16         1        18         2        21 
## SRP103771 SRP104148 SRP108951 SRP115598 SRP117282 SRP119821 SRP125173 SRP130953 
##         2         1        16         4         1         1         1         3 
## SRP132520 SRP137150 SRP140829 SRP168237 SRP170234 SRP183467 SRP188526 SRP189162 
##         4         1         6         4        10        18        80        16 
## SRP193787 SRP198932 SRP199640 SRP200058 SRP218935 
##        12         2        12         8         1
indel <- m$star.deletion_rate_per_base + m$star.insertion_rate_per_base
indel_thresh <- indel > 0.1
ggplot(m, aes(x=1:nrow(m), y=indel, color=indel > 0.1)) +
  geom_point() + theme_bw() + labs(x='Index', y='Insertion + deletion rate')

indel_samps <- which(indel_thresh)

Trying to identify who that green tower is.

table(m$study_acc[indel > 2])
## 
## SRP061888 SRP073426 SRP166108 
##         8        12        70

I was expecting the mismatch rate per-base measure to be a fraction, but I’m clearly misunderstanding. This one needs more work.

mm_rate <- m$star.mismatch_rate_per_base._.
mm_rate_mean <- mean(mm_rate)
mm_rate_sd <- sd(mm_rate)
mm_rate_thresh <- mm_rate > mm_rate_mean + (2*mm_rate_sd)
ggplot(m, aes(x=1:nrow(m), y=mm_rate, color=mm_rate > mm_rate_mean + (2*mm_rate_sd), alpha=I(0.3))) +
  geom_point() + theme_bw() + labs(x='Index', y='Mismatch rate (%) per base')

mm_rate_samps <- which(mm_rate_thresh)

Why is this ever negative?

sc_avg <- (m$star.average_input_read_length + m$star.average_input_read_length2 - m$star.average_mapped_length - m$star.average_mapped_length2) / (m$star.average_input_read_length + m$star.average_input_read_length2)
sc_avg_thresh <- sc_avg > 0.15
ggplot(m, aes(x=1:nrow(m), y=sc_avg, color=sc_avg > 0.15)) +
  geom_point() + theme_bw() + labs(x='Index', y='Avg proportion soft clipped')

sc_avg_samps <- which(sc_avg_thresh)
table(m$study_acc[sc_avg > 0.95])
## 
## DRP000499 ERP021420 SRP001371 SRP028887 SRP035934 SRP040110 SRP064863 SRP073813 
##         1         1         1         4         2         1        11         1 
## SRP093667 SRP100634 SRP114956 SRP128731 SRP144016 SRP150087 SRP153798 SRP155564 
##         1        21         1         1         2         1         2         2 
## SRP168237 SRP170234 SRP189162 SRP193787 SRP194624 SRP199640 SRP223635 
##         4        10        16        12        33        12        15
other <- m$star.number_of_reads_unmapped._other / m$star.number_of_input_reads
other_thresh <- other > 0.1
other_samps <- which(other_thresh)
ggplot(m, aes(x=1:nrow(m), y=other, color=other > 0.1)) +
  geom_point() + theme_bw() + labs(x='Index', y='Fraction reads unmapped for "other" reason')

table(m$study_acc[other_thresh])
## 
## DRP000499 DRP002623 ERP009114 ERP021735 ERP104024 ERP111307 ERP115036 ERP116477 
##         1         2         1         2        12         2        52         2 
## SRP000762 SRP001371 SRP002079 SRP002640 SRP003726 SRP004042 SRP004847 SRP004965 
##         2         1         1         3         1         1         2         1 
## SRP005177 SRP006475 SRP007417 SRP007596 SRP008218 SRP009276 SRP010279 SRP011085 
##         1         1         1         1         1         3         1         4 
## SRP012062 SRP012557 SRP013239 SRP013450 SRP013565 SRP014591 SRP017411 SRP017575 
##         1         1         1         1         1         1         1        52 
## SRP018861 SRP019272 SRP019994 SRP021191 SRP022920 SRP023262 SRP024674 SRP028170 
##         1         1        98        10         6        76         1         4 
## SRP028180 SRP029953 SRP032455 SRP033276 SRP034953 SRP036053 SRP036821 SRP040472 
##         4         4         3         9         6         7        43        73 
## SRP041833 SRP042161 SRP044956 SRP055513 SRP056802 SRP061707 SRP062144 SRP063500 
##         1         1         4         1         1         1         8         1 
## SRP067502 SRP067524 SRP068609 SRP068969 SRP073940 SRP076107 SRP081236 SRP082426 
##        11         2        11        11         1         1         3         7 
## SRP083756 SRP094435 SRP095307 SRP096338 SRP097912 SRP100634 SRP101731 SRP102077 
##         1         1         2         2         1        21         4         5 
## SRP106148 SRP107789 SRP110081 SRP111459 SRP112553 SRP113558 SRP114410 SRP114672 
##         1         9         6         1         4         1         1         1 
## SRP117905 SRP121075 SRP131142 SRP132701 SRP133108 SRP133393 SRP144189 SRP144382 
##         3         1         6         4         1         1         2         6 
## SRP144647 SRP150087 SRP150331 SRP150456 SRP151471 SRP155771 SRP157907 SRP163252 
##         2         1         2         1         1         1         2         1 
## SRP163259 SRP168237 SRP168244 SRP170234 SRP173190 SRP173497 SRP174449 SRP174485 
##        11         4        19        10         4         4        24         2 
## SRP175016 SRP183467 SRP183811 SRP186406 SRP186450 SRP186455 SRP189116 SRP189162 
##         1         6         2         2         5         1         1        16 
## SRP189867 SRP193787 SRP193940 SRP194624 SRP197074 SRP198244 SRP198803 SRP199640 
##         2        12         1        53         2        22         2        12 
## SRP219272 SRP219652 SRP221385 
##         1         2         2
juncs_per_base <- m$junction_count / (m$star.all_mapped_reads * m$star.average_mapped_length)
juncs_per_base_thresh <- juncs_per_base < 0.000001
ggplot(m, aes(x=1:nrow(m), y=juncs_per_base, color=juncs_per_base < 0.000001, alpha=I(0.1))) +
  geom_point() + theme_bw() + labs(x='Index', y='Junctions per aligned base') + ylim(0, 0.00001)
## Warning: Removed 296275 rows containing missing values (geom_point).

juncs_per_base_samps <- which(juncs_per_base_thresh)
upset(fromList(list(juncs_per_base=juncs_per_base_samps,
                    mm_rate=mm_rate_samps,
                    indel=indel_samps,
                    read_len=read_len_samps,
                    pct_chim=pct_chim_samps,
                    assigned_frac=assigned_frac_samps,
                    frac_non=frac_non_samps,
                    short_frac=short_frac_samps,
                    mapped=mapped_samps,
                    few_reads=few_samps,
                    mt_pct=mt_samps,
                    rep_al=repal_samps,
                    unmapped_other=other_samps)), nsets = 13, order.by = "freq")